library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(rvest)
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
Create a function that, given the Amazon product ID, retrieves some valuable information (like product details and number of customer ratings)
#' Scrape basic information for an Amazon (UK) product page.
#'
#' @param id Amazon product ID (ASIN), e.g. "0099908506".
#' @return A one-row tibble with `product_details` (text of the detail-bullet
#'   list, without rank or number of reviews) and `number_of_ratings`
#'   (e.g. "5,949 ratings"). Either field is NA if its CSS selector does not
#'   match, e.g. when Amazon changes the page layout.
amazon_product_info <- function(id) {
  url <- paste0("https://www.amazon.co.uk/dp/", id)
  html <- read_html(url)
  # Product details block (no rank nor number of reviews)
  product_details <- html %>%
    html_element("#detailBullets_feature_div") %>%
    html_element("[class='a-unordered-list a-nostyle a-vertical a-spacing-none detail-bullet-list']") %>%
    html_text2()
  # Only the number of customer ratings
  number_of_ratings <- html %>%
    html_element("#acrCustomerReviewText") %>%
    html_text2()
  # Last expression is the return value; no explicit return() needed
  tibble(product_details, number_of_ratings)
}
After choosing a product, we use the previous function to obtain information about it.
id_prod = "0099908506" # id product
prod_info = amazon_product_info(id_prod)
prod_info
## # A tibble: 1 × 2
## product_details number_of_ratin…
## <chr> <chr>
## 1 "ASIN : 0099908506\nPublisher : Arrow; New Ed edition (1… 5,949 ratings
Create a function to obtain the product reviews (title, text, review stars), considering both UK and non-UK reviews.
#' Scrape one page of Amazon reviews (title, text, stars) for a product.
#'
#' UK and non-UK reviews use different CSS classes / data-hooks for titles
#' and star ratings, so both selectors are queried and concatenated; the
#' review text uses a single selector for both groups.
#'
#' @param id Amazon product ID (ASIN).
#' @param page Review page number to fetch.
#' @return A tibble with columns `title`, `text`, `star` and `page`.
#'   NOTE(review): UK titles/stars are collected first and non-UK ones
#'   appended, while `text` is collected in page order from one selector --
#'   on a page mixing UK and non-UK reviews, rows may pair a title/star with
#'   the wrong text. Confirm against such a page before relying on row-wise
#'   alignment.
amazon_reviews <- function(id, page) {
  # Build the review-page URL from the product id and page number
  url <- paste0("https://www.amazon.co.uk/product-reviews/",
                id, "/?pageNumber=", page)
  html <- read_html(url)
  # Review titles: UK reviews first, then non-UK reviews appended
  title <- html %>%
    html_elements("[class='a-size-base a-link-normal review-title a-color-base review-title-content a-text-bold']") %>%
    html_text2()
  title <- c(title,
             html %>%
               html_elements("[class='a-size-base review-title a-color-base review-title-content a-text-bold']") %>%
               html_text2())
  # Review text (same selector for UK and non-UK reviews)
  text <- html %>%
    html_elements("[class='a-size-base review-text review-text-content']") %>%
    html_text2()
  # Review stars: UK reviews first, then non-UK reviews appended
  star <- html %>%
    html_elements("[data-hook='review-star-rating']") %>%
    html_text2()
  star <- c(star,
            html %>%
              html_elements("[data-hook='cmps-review-star-rating']") %>%
              html_text2())
  tibble(title, text, star, page = page)
}
With map_df function from the purrr package we can iterate the task over multiple pages to create a dataframe.
library(purrr)
# Scrape review pages 1-30 and row-bind the results into one tibble
page <- 1:30
prod_rev <- map_df(page, ~amazon_reviews(id_prod, page = .))
# Add a stable per-review document id; seq_len() is safe even for an empty
# result, unlike 1:nrow() which would yield c(1, 0)
prod_rev$doc_id <- seq_len(nrow(prod_rev))
head(prod_rev)
## # A tibble: 6 × 5
## title text star page doc_id
## <chr> <chr> <chr> <int> <int>
## 1 Flawed, dated, but still brilliant "I f… 5.0 … 1 1
## 2 Alcoholic's Twaddle "My … 2.0 … 1 2
## 3 I suppose ol’ Ernie got better after this… "Thi… 2.0 … 1 3
## 4 An Old Classic That Stands the Test of Time "It … 5.0 … 1 4
## 5 Moonshine "An … 3.0 … 1 5
## 6 Heavy on style, light on plot, light on fidelity to … "Par… 3.0 … 1 6
Consider only English written reviews
library(cld2) # detect_language() returns NA when the language cannot be determined
# Detect the language of both the review title and the review body
prod_rev <- prod_rev %>%
  mutate(title_lang = detect_language(title),
         text_lang = detect_language(text))
# Cross-tabulate the two detections, keeping NA counts visible
table(prod_rev$text_lang, prod_rev$title_lang, useNA = "always")
##
## de en es fr mg pt <NA>
## ca 0 0 0 0 0 1 0
## de 1 1 0 0 0 0 1
## en 0 168 0 0 1 0 82
## es 0 1 2 0 0 0 3
## fr 0 1 0 5 0 0 3
## it 0 1 0 0 0 0 2
## pt 0 0 1 0 0 2 0
## <NA> 0 0 0 0 0 0 24
# Keep only the reviews whose body text was detected as English
prod_rev <- filter(prod_rev, text_lang == "en")
prod_rev
## # A tibble: 251 × 7
## title text star page doc_id title_lang text_lang
## <chr> <chr> <chr> <int> <int> <chr> <chr>
## 1 Flawed, dated, but still brill… "I f… 5.0 … 1 1 en en
## 2 Alcoholic's Twaddle "My … 2.0 … 1 2 en en
## 3 I suppose ol’ Ernie got better… "Thi… 2.0 … 1 3 en en
## 4 An Old Classic That Stands the… "It … 5.0 … 1 4 en en
## 5 Moonshine "An … 3.0 … 1 5 <NA> en
## 6 Heavy on style, light on plot,… "Par… 3.0 … 1 6 en en
## 7 A VINTAGE NOVEL "Thi… 4.0 … 1 7 <NA> en
## 8 Nothing happens "A s… 2.0 … 1 8 en en
## 9 Tedious, lacking direction and… "I'v… 1.0 … 1 9 en en
## 10 More style than subsance "A h… 4.0 … 1 10 en en
## # … with 241 more rows
Extract a numeric score from the stars string
# Convert the star string (e.g. "4.0 out of 5 stars") to a numeric score by
# taking its first character
prod_rev <- prod_rev %>%
  mutate(score = as.numeric(substr(star, 1, 1)))
summary(prod_rev$score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 3.0 4.0 3.9 5.0 5.0
# Distribution of stars: counts and proportions
prod_rev %>%
  count(score) %>%
  mutate(p = round(n / sum(n), 2))
## # A tibble: 5 × 3
## score n p
## <dbl> <int> <dbl>
## 1 1 18 0.07
## 2 2 25 0.1
## 3 3 39 0.16
## 4 4 51 0.2
## 5 5 118 0.47
# Bar chart of the star scores. geom_bar's default stat already counts rows
# per x value, so the deprecated `..count..` pronoun (superseded by
# after_stat()) was redundant and is removed.
prod_rev %>%
  ggplot(aes(x = score)) +
  geom_bar(fill = "steelblue") +
  labs(title = "Amazon reviews' stars", x = "Stars", y = "Number of comments") +
  theme_bw() +
  theme(plot.title = element_text(color = "steelblue", size = 12, face = "bold"),
        plot.subtitle = element_text(color = "steelblue2"))
It appears that positive reviews prevail. From the 5-class score, we can transform it to a binary classification: if the review has 4 or 5 stars it is positive, otherwise it is negative.
# Binary sentiment label: 4 or 5 stars => "positive", otherwise "negative".
# Compare on the numeric `score` column: the original `star >= 4` compared the
# *character* string "X.0 out of 5 stars" against 4, which only worked by
# accident of lexicographic ordering on the first character.
prod_rev <- prod_rev %>%
  mutate(star_sent = ifelse(score >= 4, "positive", "negative"))
# Binary variable's distribution
prod_rev %>%
  count(star_sent) %>%
  mutate(p = round(n/sum(n), 2))
## # A tibble: 2 × 3
## star_sent n p
## <chr> <int> <dbl>
## 1 negative 82 0.33
## 2 positive 169 0.67
# Review length in characters, compared across the two sentiment classes
prod_rev$nchar <- str_length(prod_rev$text)
ggplot(prod_rev, aes(x = star_sent, y = nchar, fill = star_sent)) +
  geom_boxplot() +
  theme_bw() +
  scale_fill_manual(values = c("steelblue", "skyblue"))
In order to conduct a better analysis we need to clean the text data, making it easier to work with. Stop-words (customized in our case), upper-case letters, punctuation and digits are dropped.
We create our own custom stop-words because we have a problem with the apostrophe symbol: a contraction is not detected by the standard stop-word list when it appears with the typographic ’ instead of ', and a cruder filtering approach would delete even some non-stop-words, so we create custom_stopwords.
library(tidytext)
# Create our custom stop-words: the standard stop_words lexicon spells
# contractions with the ASCII apostrophe ('), but the scraped reviews often
# use the typographic right single quote (’), so those variants would survive
# filtering. Add them explicitly and bind them onto stop_words.
custom_stopwords = bind_rows(
tibble(word = c(
"t’s","i’m","you’re","he’s","she’s","it’s","we’re","they’re","i’ve","you’ve","we’ve","they’ve","i’d","you’d","he’d","she’d",
"we’d","they’d","i’ll","you’ll","he’ll", "she’ll","we’ll","they’ll","isn’t","aren’t","wasn’t","weren’t","hasn’t",
"haven’t","hadn’t","doesn’t","don’t","didn’t","won’t","wouldn’t","shan’t","shouldn’t","can’t","cannot","couldn’t","mustn’t",
"let’s","that’s","who’s", "what’s","here’s","there’s","when’s","where’s","why’s","how’s","a’s","ain’t", "c’s","c’mon"),
lexicon = "custom"), stop_words)
# Tokenize the review text into one word per row, drop stop-words, then drop
# purely numeric tokens and tokens glued together by punctuation mistakes
tidy_text = prod_rev %>%
unnest_tokens(word, text) %>%
anti_join(custom_stopwords) %>%
filter(!str_detect(word, "^([[:digit:]]+)$")) %>% # filter for numbers (~130 words)
filter(!str_detect(word, "^([[:alnum:]]+)[.,]([[:alnum:]]+)")) # filter for numbers with decimal (few words)
## Joining, by = "word"
# + word.word(mistakes in punctuation)(~300 words)
# Look at some frequent terms
freq.df = tidy_text %>%
count(word, sort = T)
head(freq.df, 20)
## # A tibble: 20 × 2
## word n
## <chr> <int>
## 1 book 193
## 2 hemingway 174
## 3 read 125
## 4 brett 94
## 5 jake 85
## 6 characters 72
## 7 paris 71
## 8 story 69
## 9 time 67
## 10 hemingway's 56
## 11 reading 56
## 12 spain 55
## 13 style 55
## 14 love 53
## 15 war 50
## 16 pamplona 46
## 17 life 45
## 18 bull 42
## 19 fiesta 39
## 20 writing 39
For word normalization we could use either stemming or lemmatization. The goal of both methods is to reduce inflectional forms and sometimes derivationally related forms of a word to a common base form. For our analysis we use Stemming, which is the process of reducing the word to its root eliminating the suffix.
# STEMMING: reduce each token to its root by stripping suffixes (SnowballC)
library(SnowballC)
tidy_stem = tidy_text %>%
mutate(word = wordStem(word))
# LEMMATIZATION: annotate the raw reviews with udpipe's English GUM model
# (one row per token, with lemma, POS tags and dependency relations)
library(udpipe)
## Warning: package 'udpipe' was built under R version 4.0.5
tidy_lemma <- udpipe(prod_rev, "english-gum")
tidy_lemma = tidy_lemma %>%
mutate(stem = wordStem(token)) %>% # add the stem alongside the lemma for comparison
tibble()
tidy_lemma # table and the differences between token (word) lemmas and stems:
## # A tibble: 26,236 × 18
## doc_id paragraph_id sentence_id sentence start end term_id token_id token
## <chr> <int> <int> <chr> <int> <int> <int> <chr> <chr>
## 1 1 1 1 I finishe… 1 1 1 1 I
## 2 1 1 1 I finishe… 3 10 2 2 fini…
## 3 1 1 1 I finishe… 12 15 3 3 this
## 4 1 1 1 I finishe… 17 21 4 4 novel
## 5 1 1 1 I finishe… 23 25 5 5 for
## 6 1 1 1 I finishe… 27 29 6 6 the
## 7 1 1 1 I finishe… 31 36 7 7 seco…
## 8 1 1 1 I finishe… 38 41 8 8 time
## 9 1 1 1 I finishe… 43 46 9 9 last
## 10 1 1 1 I finishe… 48 52 10 10 night
## # … with 26,226 more rows, and 9 more variables: lemma <chr>, upos <chr>,
## # xpos <chr>, feats <chr>, head_token_id <chr>, dep_rel <chr>, deps <chr>,
## # misc <chr>, stem <chr>
tidy_lemma %>%
select(token, lemma, stem)
## # A tibble: 26,236 × 3
## token lemma stem
## <chr> <chr> <chr>
## 1 I I I
## 2 finished finish finish
## 3 this this thi
## 4 novel novel novel
## 5 for for for
## 6 the the the
## 7 second second second
## 8 time time time
## 9 last last last
## 10 night night night
## # … with 26,226 more rows
We first consider the tidy approach, where we consider words as tokens. With this SA approach, we will use three lexicons: BING (gives words a positive or negative sentiment), AFINN (rates words with a value from -5 to +5), and NRC (labels words with six possible sentiments or emotions).The procedure for each of these lexicons is similar, but the results are dependent on the lexicon itself. With every specific lexicon, we are able to give a sentiment or value to (almost) each word, and then we compute the value of each review as an aggregation of the contained words’ values/sentiment. We later plot our results using histograms.
bing = get_sentiments("bing")
# Per-review BING score: count positive and negative lexicon words per doc_id,
# spread them into two columns, and take the difference (positive - negative)
prod_rev_bing = tidy_text %>%
select(doc_id, word) %>%
inner_join(bing) %>%
count(doc_id, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(bing = positive - negative)
## Joining, by = "word"
# Attach the score to the reviews; reviews with no lexicon match get NA
prod_rev = prod_rev %>%
left_join(prod_rev_bing %>%
select(doc_id, bing))
## Joining, by = "doc_id"
hist(prod_rev$bing, col = "red", main = "Sentiment distribution - tidy- bing lexicon")
summary(prod_rev$bing)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -25.0000 -1.0000 0.0000 -0.3005 1.5000 12.0000 48
# Analyze different words' contribution to the sentiment.
bing_word_counts <- tidy_text %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
# Plot the 5 highest-count words for each sentiment class
bing_word_counts %>%
group_by(sentiment) %>%
slice_max(n, n = 5, with_ties = F) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = F) +
facet_wrap(~sentiment, scales = "free_y") +
labs(x = "Contribution to sentiment", y = NULL) +
theme_bw() + scale_fill_manual(values = c("steelblue","skyblue"))
We can also plot a word-cloud. The color represent the sentiment associated to a particular word, while the size of each word depends on the its frequency.
library(wordcloud)
## Loading required package: RColorBrewer
## Warning: package 'RColorBrewer' was built under R version 4.0.5
library(wordcloud2)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Comparison word-cloud: acast() builds a word-by-sentiment count matrix,
# which comparison.cloud() renders with one colour per sentiment and word
# size proportional to frequency
tidy_text %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("orangered", "darkgreen"), max.words = 100)
## Joining, by = "word"
We conduct the same analysis as before, using the AFINN lexicon (a few arrangements are needed).
afinn = get_sentiments("afinn")
# Per-review AFINN score: AFINN assigns each word an integer value; the
# review score is the sum of matched word values per doc_id
prod_rev_afinn = tidy_text %>%
select(doc_id, word) %>%
inner_join(afinn) %>%
group_by(doc_id) %>%
summarise(afinn = sum(value))
## Joining, by = "word"
# Attach the score to the reviews; reviews with no lexicon match get NA
prod_rev = prod_rev %>%
left_join(prod_rev_afinn %>%
select(doc_id, afinn))
## Joining, by = "doc_id"
hist(prod_rev$afinn, col = "blue", main = "Sentiment distribution - tidy - afinn lexicon")
summary(prod_rev$afinn)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -26.000 -2.000 3.000 1.753 5.000 30.000 69
# Let's see the contribution of words to the sentiment.
afinn_word_counts <- tidy_text %>%
inner_join(get_sentiments("afinn")) %>%
count(word, value, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
# Top 5 words per AFINN value, faceted by value
afinn_word_counts %>%
group_by(value) %>%
slice_max(n, n = 5, with_ties = F) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = value)) + geom_col(show.legend = F) +
facet_wrap(~value, scales = "free_y") + labs(x = "Contribution to sentiment",
y = NULL)
We conduct the same analysis as before, using the NRC lexicon (a few arrangements are needed).
nrc = get_sentiments("nrc")
# Per-review NRC score: count how many words fall in each NRC category per
# doc_id (a word can match several categories, one row per category), then
# take positive - negative as the review score
prod_rev_nrc = tidy_text %>%
select(doc_id, word) %>%
inner_join(nrc) %>%
count(doc_id, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(nrc = positive - negative)
## Joining, by = "word"
# Attach the score to the reviews; reviews with no lexicon match get NA
prod_rev = prod_rev %>%
left_join(prod_rev_nrc %>%
select(doc_id, nrc))
## Joining, by = "doc_id"
hist(prod_rev$nrc, col = "yellow", main = "Sentiment distribution - tidy - nrc lexicon")
summary(prod_rev$nrc)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -10.000 0.000 1.000 2.363 4.000 28.000 36
# Let's see the contribution of words to the sentiment.
nrc_word_counts <- tidy_text %>%
inner_join(get_sentiments("nrc")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
# Top 5 words per NRC category
nrc_word_counts %>%
group_by(sentiment) %>%
slice_max(n, n = 5, with_ties = F) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) + geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") + labs(x = "Contribution to sentiment",
y = NULL) + theme_bw()
Lexicons histogram - comparing sentiment distribution using different lexicons
# Overlay the three per-review sentiment distributions (BING, AFINN, NRC);
# each warning below counts the reviews whose score is NA for that lexicon
prod_rev %>%
ggplot() +
geom_histogram(aes(x = bing, fill = "b"), bins = 40, alpha = 0.5) +
geom_histogram(aes(x = afinn, fill = "a"), bins = 40, alpha = 0.5) +
geom_histogram(aes(x = nrc, fill = "n"), bins = 40 , alpha = 0.5) +
scale_fill_manual(name ="lexicon", values = c("b" = "red", "a" = "blue", "n" = "yellow"),
labels=c("b" = "bing", "a" = "afinn", "n" = "nrc")) +
labs(title= "Sentiment Distribution using all 3 lexicons", y = "Frequency", x = "Sentiment")
## Warning: Removed 48 rows containing non-finite values (stat_bin).
## Warning: Removed 69 rows containing non-finite values (stat_bin).
## Warning: Removed 36 rows containing non-finite values (stat_bin).
Word count sentiments - compare most common positive/negative words
(considering different lexicons)
# BING: top 5 contributing words per sentiment class
bing_word_counts %>%
group_by(sentiment) %>%
slice_max(n, n = 5, with_ties = F) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = F) +
facet_wrap(~sentiment, scales = "free_y") +
labs(x = "Contribution to sentiment - BING", y = NULL) +
theme_bw() + scale_fill_manual(values = c("red4","red"))
# AFINN: derive a binary sentiment from the sign of the AFINN value, then
# plot the top 5 words per class
afinn_word_counts %>%
group_by(sentiment = ifelse(value>0, "positive", "negative")) %>%
slice_max(n, n = 5, with_ties = F) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = F) +
facet_wrap(~sentiment, scales = "free_y") +
labs(x = "Contribution to sentiment - AFINN", y = NULL) +
theme_bw() + scale_fill_manual(values = c("steelblue","skyblue"))
# NRC: keep only the positive/negative categories (drop the emotions) and
# plot the top 5 words per class
nrc_word_counts %>%
filter(sentiment %in% c("positive", "negative")) %>%
group_by(sentiment) %>%
slice_max(n, n = 5, with_ties = F) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) + geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") + labs(x = "Contribution to sentiment - NRC", y = NULL) + theme_bw() +
scale_fill_manual(values = c("goldenrod3","gold"))
With this approach, we also consider polarity negators and polarity amplifiers (we will consider the previous 2 words, not the following words). The performance increases when we consider them both. However, this approach is also not free from possible problems: there are some situations in which it under-performs the previous one. We can use lemmas or words in the analysis and we can use any one of the three lexicons (we will not drop stop-words).
library(udpipe)
# Re-annotate the reviews; txt_sentiment() needs udpipe's token table
data_udpipe <- udpipe(prod_rev, "english-gum")
# BING as a polarity dictionary: negative -> -1, positive -> +1, with the
# column names txt_sentiment() expects (term, polarity)
bing_dict = get_sentiments("bing") %>%
mutate(sentiment = ifelse(sentiment == "negative", -1, 1)) %>%
rename(term = "word", polarity = "sentiment")
scores_b <- txt_sentiment(x = data_udpipe,
term = "lemma", # match the dictionary against lemmas instead of raw words
polarity_terms = bing_dict, # stop-words are NOT dropped in this approach
polarity_negators = "not", # "not" within the 2 preceding tokens flips polarity
polarity_amplifiers = "very", # "very" within the 2 preceding tokens amplifies it
n_before = 2,
n_after = 0,
constrain = F)
prod_rev$udpipe_bing_l = scores_b$overall$sentiment_polarity
summary(prod_rev$udpipe_bing_l)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -15.0000 0.0000 1.0000 0.9665 2.0000 14.0000
summary(prod_rev$bing)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -25.0000 -1.0000 0.0000 -0.3005 1.5000 12.0000 48
# Compare distributions between the tidy approach using Bing and the udpipe approach using the same lexicon (and lemmas)
par(mfrow = c(1, 2))
hist(scale(prod_rev$bing), col = "lightblue", main = "Sentiment distribution-bing")
hist(scale(prod_rev$udpipe_bing_l), col = "lightblue", main = "udpipe (bing dict-lemmas)")
scores_c <- txt_sentiment(x = data_udpipe,
term = "token", # here we match against the raw tokens (words), not lemmas
polarity_terms = bing_dict, # stop-words are NOT dropped in this approach
polarity_negators = c("not"), # same negator/amplifier window as above
polarity_amplifiers = c("very"),
n_before = 2,
n_after = 0,
constrain = F)
prod_rev$udpipe_bing_w = scores_c$overall$sentiment_polarity
summary(prod_rev$udpipe_bing_w)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -11.2000 0.0000 1.0000 0.9474 2.0000 13.0000
# Compare distributions between the tidy approach using Bing, the udpipe approach with lemmas, and the udpipe approach with words
par(mfrow = c(1, 3))
hist(scale(prod_rev$bing), col = "lightblue", main = "Sentiment distribution - bing") # tidy approach
hist(scale(prod_rev$udpipe_bing_w), col = "lightblue", main = "udpipe (bing dict) - words") # udpipe approach with words
hist(scale(prod_rev$udpipe_bing_l), col = "lightblue", main = "udpipe (bing dict-lemmas)") # udpipe with lemmas
We can repeat all this process for all the other lexicons.
# AFINN as a polarity dictionary: its numeric value is used directly as the
# polarity weight
afinn_dict = get_sentiments("afinn") %>%
rename(term = "word", polarity = "value")
data_udpipe <- udpipe(prod_rev, "english-gum")
scores_a <- txt_sentiment(x = data_udpipe,
term = "lemma", # match the dictionary against lemmas instead of raw words
polarity_terms = afinn_dict, # stop-words are NOT dropped in this approach
polarity_negators = c("not"), # "not" within the 2 preceding tokens flips polarity
polarity_amplifiers = c("very"), # "very" within the 2 preceding tokens amplifies it
n_before = 2,
n_after = 0,
constrain = F)
prod_rev$udpipe_afinn_l = scores_a$overall$sentiment_polarity
hist(prod_rev$udpipe_afinn_l, col = "lightblue", main = "Sentiment distribution - udpipe (afinn dict - lemmas)")
summary(prod_rev$udpipe_afinn_l)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -13.000 0.000 3.000 4.683 6.800 52.600
summary(prod_rev$afinn)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -26.000 -2.000 3.000 1.753 5.000 30.000 69
# Compare distributions between the tidy approach using Afinn and the udpipe approach using the same lexicon (and lemmas)
par(mfrow = c(1, 2))
hist(scale(prod_rev$afinn), col = "lightblue", main = "Sentiment distribution - afinn")
hist(scale(prod_rev$udpipe_afinn_l), col = "lightblue", main = "Sentiment distribution - udpipe (afinn dict - lemmas)")
scores_a <- txt_sentiment(x = data_udpipe,
term = "token", # here we match against the raw tokens (words), not lemmas
polarity_terms = afinn_dict, # stop-words are NOT dropped in this approach
polarity_negators = c("not"), # same negator/amplifier window as above
polarity_amplifiers = c("very"),
n_before = 2,
n_after = 0,
constrain = F)
prod_rev$udpipe_afinn_w = scores_a$overall$sentiment_polarity
hist(prod_rev$udpipe_afinn_w, col = "lightblue", main = "Sentiment distribution - udpipe (afinn dict - words)")
summary(prod_rev$udpipe_afinn_w)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -13.00 0.00 3.00 4.24 6.00 57.00
# Compare distributions between the tidy approach using Afinn, the udpipe approach with lemmas, and the udpipe approach with words
par(mfrow = c(1, 3))
hist(scale(prod_rev$afinn), col = "lightblue", main = "Sentiment distribution - afinn")
hist(scale(prod_rev$udpipe_afinn_l), col = "lightblue", main = "Sentiment distribution - udpipe (afinn dict - lemmas)")
hist(scale(prod_rev$udpipe_afinn_w), col = "lightblue", main = "Sentiment distribution - udpipe (afinn dict - words)")
# NRC as a polarity dictionary: negative -> -1, every other category -> +1
nrc_dict = get_sentiments("nrc") %>%
mutate(sentiment = ifelse(sentiment == "negative", -1, 1)) %>%
rename(term = "word", polarity = "sentiment")
data_udpipe <- udpipe(prod_rev, "english-gum")
scores_n <- txt_sentiment(x = data_udpipe,
term = "lemma", # match the dictionary against lemmas instead of raw words
polarity_terms = nrc_dict, # stop-words are NOT dropped in this approach
polarity_negators = c("not"), # "not" within the 2 preceding tokens flips polarity
polarity_amplifiers = c("very"), # "very" within the 2 preceding tokens amplifies it
n_before = 2,
n_after = 0,
constrain = F)
prod_rev$udpipe_nrc_l = scores_n$overall$sentiment_polarity
summary(prod_rev$udpipe_nrc_l)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.000 1.000 3.000 7.363 6.500 79.000
summary(prod_rev$nrc)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -10.000 0.000 1.000 2.363 4.000 28.000 36
# Compare distributions between the tidy approach using NRC and the udpipe approach using the same lexicon (and lemmas)
par(mfrow = c(1, 2))
hist(scale(prod_rev$nrc), col = "lightblue", main = "Sentiment distribution - nrc")
hist(scale(prod_rev$udpipe_nrc_l), col = "lightblue", main = "Sentiment distribution - udpipe (nrc dict - lemmas)")
scores_n <- txt_sentiment(x = data_udpipe,
term = "token", # here we match against the raw tokens (words), not lemmas
polarity_terms = nrc_dict, # stop-words are NOT dropped in this approach
polarity_negators = c("not"), # same negator/amplifier window as above
polarity_amplifiers = c("very"),
n_before = 2,
n_after = 0,
constrain = F)
prod_rev$udpipe_nrc_w = scores_n$overall$sentiment_polarity
hist(prod_rev$udpipe_nrc_w, col = "lightblue", main = "Sentiment distribution - udpipe (nrc dict - words)")
summary(prod_rev$udpipe_nrc_w)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.000 1.000 2.000 6.268 6.400 61.000
# Compare distributions between the tidy approach using NRC, the udpipe approach with lemmas, and the udpipe approach with words
par(mfrow = c(1, 3))
hist(scale(prod_rev$nrc), col = "lightblue", main = "Sentiment distribution - nrc")
hist(scale(prod_rev$udpipe_nrc_l), col = "lightblue", main = "Sentiment distribution - udpipe (nrc dict - lemmas)")
hist(scale(prod_rev$udpipe_nrc_w), col = "lightblue", main = "Sentiment distribution - udpipe (nrc dict - words)")
# BING: collapse each score to its sign (-1/0/+1, NA -> 0) and plot the three
# signals (star label, udpipe-with-lemmas score, tidy score) per review
prod_rev %>%
select(doc_id, star_sent, udpipe_bing_l, bing) %>%
mutate(star_sent = ifelse(star_sent == "positive", 1, -1),
udpipe_bing_l = ifelse(udpipe_bing_l > 0, 1, ifelse(udpipe_bing_l < 0, -1, 0)),
bing = ifelse(bing > 0, 1, ifelse(bing < 0, -1, 0)), bing = replace_na(bing, 0)
) %>%
pivot_longer(cols = c("star_sent", "udpipe_bing_l", "bing")) %>%
ggplot(aes(doc_id, value, fill = name)) + geom_col(show.legend = FALSE) +
facet_wrap(~name, ncol = 1, scales = "free_y", strip.position = "right") +
theme_bw() + scale_fill_manual(values = c("deepskyblue2", "steelblue", "deepskyblue2")) + ggtitle('Compare: Tidy SA, Udpipe SA with lemmas, and Reviews stars. (Using BING)')
# AFINN: same sign-collapse and comparison, for the AFINN-based scores
prod_rev %>%
select(doc_id, star_sent, udpipe_afinn_l, afinn) %>%
mutate(star_sent = ifelse(star_sent == "positive", 1, -1),
udpipe_afinn_l = ifelse(udpipe_afinn_l > 0, 1, ifelse(udpipe_afinn_l < 0, -1, 0)),
afinn = ifelse(afinn > 0, 1, ifelse(afinn < 0, -1, 0)), afinn = replace_na(afinn, 0)) %>%
pivot_longer(cols = c("star_sent", "udpipe_afinn_l", "afinn")) %>%
ggplot(aes(doc_id, value, fill = name)) + geom_col(show.legend = FALSE) +
facet_wrap(~name, ncol = 1, scales = "free_y", strip.position = "right") +
theme_bw() + scale_fill_manual(values = c("deepskyblue2", "steelblue", "deepskyblue2")) + ggtitle('Compare: Tidy SA, Udpipe SA with lemmas, and Reviews stars. (Using AFINN)')
# NRC: same sign-collapse and comparison, for the NRC-based scores
prod_rev %>%
select(doc_id, star_sent, udpipe_nrc_l, nrc) %>%
mutate(star_sent = ifelse(star_sent == "positive", 1, -1),
udpipe_nrc_l = ifelse(udpipe_nrc_l > 0, 1, ifelse(udpipe_nrc_l < 0, -1, 0)),
nrc = ifelse(nrc > 0, 1, ifelse(nrc < 0, -1, 0)), nrc = replace_na(nrc, 0)) %>%
pivot_longer(cols = c("star_sent", "udpipe_nrc_l", "nrc")) %>%
ggplot(aes(doc_id, value, fill = name)) + geom_col(show.legend = FALSE) +
facet_wrap(~name, ncol = 1, scales = "free_y", strip.position = "right") +
theme_bw() + scale_fill_manual(values = c("deepskyblue2", "steelblue", "deepskyblue2"))+ ggtitle('Compare: Tidy SA, Udpipe SA with lemmas, and Reviews stars. (Using NRC)')
In all these cases there are some differences. We can also compare the sentiments with the star score (pretending that it is the true one). Notice how these results strongly depend on the pre-processing phase (for the tidy approach we eliminated stop-words; for the udpipe one we considered lemmas instead of words and we didn’t remove stop-words).
We start by looking at the most frequent stems in the whole corpus (all the documents).
# Most frequent stems in the whole corpus.
# Fixes: theme_bw() is a complete theme and resets all theme elements, so it
# must come BEFORE theme() -- the original applied it last and silently
# discarded the custom title styling. geom_bar(stat = "identity") is spelled
# idiomatically as geom_col(); the unused aes(fill = word) mapping (overridden
# by the constant fill) is dropped.
tidy_stem %>%
  count(word) %>%
  slice_max(n, n = 10) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col(fill = "skyblue") +
  xlab(NULL) +
  labs(title = "Most common stems in reviews", y = "Stems count") +
  coord_flip() +
  theme_bw() +
  theme(legend.position = "none",
        plot.title = element_text(color = "steelblue", size = 12, face = "bold"))
Then, we can compare the stems used by people who wrote positive and negative reviews respectively.
# Most common stems in positive vs negative reviews.
# Fixes: the duplicated group_by(star_sent) is removed (count(star_sent, word)
# already yields per-class counts); element_text() was given a stray
# positional "dodgerblue" that silently landed in its `family` argument;
# and theme_bw() must precede theme() so the legend/title settings are not
# reset by the complete theme.
tidy_stem %>%
  count(star_sent, word) %>%
  group_by(star_sent) %>%
  slice_max(n, n = 10, with_ties = F) %>%
  ungroup() %>%
  mutate(star_sent = as.factor(star_sent),
         word = reorder_within(word, n, star_sent)) %>%
  ggplot(aes(word, n, fill = star_sent)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~star_sent, scales = "free_y") +
  coord_flip() +
  labs(title = "Most common stems in positive/negative reviews", y = NULL, x = "N") +
  scale_x_reordered() +
  scale_fill_manual(values = c("orangered", "dodgerblue")) +
  theme_bw() +
  theme(legend.position = "none",
        plot.title = element_text(color = "orangered"))
In order to show which stems are important but specific to each category we can provide different visualizations/scores.
We use a geom_jitter to compare the frequency of stems in positive and negative comments. The stems which lie near to the red line are used with about the same frequency in the two categories.
# Relative stem frequency in positive vs negative reviews on log-log axes.
# Stems near the red identity line are used with similar relative frequency
# in both classes. The removed rows reported below are stems that occur in
# only one class (NA in the other column after pivot_wider).
tidy_stem %>%
group_by(star_sent) %>%
count(word, sort = T) %>%
mutate(prop = n/sum(n)) %>%
select(star_sent, word, prop) %>%
pivot_wider(names_from = star_sent, values_from = prop) %>%
arrange(positive, negative) %>%
ggplot(aes(positive, negative)) +
geom_jitter(alpha = 0.5,size = 2.5, width = 0.25, height = 0.25, colour = "steelblue") +
geom_text(aes(label = word), check_overlap = T, vjust = 0) +
scale_x_log10() +
scale_y_log10() +
geom_abline(color = "red") + theme_bw()
## Warning: Removed 1799 rows containing missing values (geom_point).
## Warning: Removed 1799 rows containing missing values (geom_text).
# Log-odds ratio of stem usage in positive vs negative reviews.
# Only stems appearing at least 10 times overall are kept; counts are
# Laplace-smoothed (+1) before forming the ratio so zero counts are defined.
word_ratios <- tidy_stem %>%
  count(word, star_sent) %>%
  group_by(word) %>%
  filter(sum(n) >= 10) %>%
  ungroup() %>%
  pivot_wider(names_from = star_sent, values_from = n, values_fill = 0) %>%
  # across(where(...)) supersedes mutate_if() (dplyr >= 1.0)
  mutate(across(where(is.numeric), ~ (.x + 1) / (sum(.x) + 1))) %>%
  mutate(logratio = log(positive / negative)) %>%
  arrange(desc(logratio))
# Plot the 15 most distinctive stems on each side of the log-odds ratio
# (positive ratios lean positive-review, negative ratios lean negative-review)
word_ratios %>%
group_by(logratio < 0) %>%
slice_max(abs(logratio), n = 15) %>%
ungroup() %>%
mutate(word = reorder(word, logratio)) %>%
ggplot(aes(word, logratio, fill = logratio < 0)) +
geom_col(show.legend = FALSE) +
coord_flip() + ylab("log odds ratio (Positive/Negative)") +
scale_fill_manual(name = "", labels = c("Positive", "Negative"),values = c("dodgerblue", "orangered")) + theme_bw()
We can also can plot some wordclouds.
# Word-cloud of the stems (word size proportional to count); random.order = F
# places the most frequent stems in the centre
tidy_stem %>%
count(word) %>%
with(wordcloud(scale = c(5, 0.7), word, n, max.words = 100,
min.freq = 2, random.order = F, rot.per = 0.15, colors = brewer.pal(8, "Paired")))
# we use the words instead of the stems and the wordcloud2 package,
# which expects a data frame with `word` and `freq` columns
frame = tidy_text %>%
count(word, sort = T)
frame = data.frame(word = frame$word, freq = frame$n)
wordcloud2(frame, color = "skyblue")
We can show some of the previous plots also for bigrams. Let’s consider a new type of visualization. More precisely, if you are interested in the relationship between words, it is useful to consider a network (with also the “direction” of the link).
library(ggraph)
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
##
## compose, simplify
## The following object is masked from 'package:tidyr':
##
## crossing
## The following object is masked from 'package:tibble':
##
## as_data_frame
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
# Extract bigrams from the review text, drop any bigram containing a
# stop-word in either position, then stem both words
tidy_big_stem <- prod_rev %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word) %>%
mutate(word1 = wordStem(word1)) %>%
mutate(word2 = wordStem(word2))
bigram_counts = tidy_big_stem %>%
count(word1, word2, sort = TRUE)
# Build a directed graph from bigrams seen at least twice
bigram_graph <- bigram_counts %>%
filter(n >= 2) %>%
graph_from_data_frame()
# Fix the seed so the force-directed ("fr") layout is reproducible
set.seed(9265)
a <- grid::arrow(type = "closed", length = unit(0.15, "inches"))
# Edge transparency encodes bigram frequency; arrows show word order
ggraph(bigram_graph, layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a, end_cap = circle(1, "inches")) +
geom_node_point(color = "skyblue", size = 3) +
geom_node_text(aes(label =name), vjust = 1, hjust = 1) + theme_void()
# Co-occurrence of nouns/adjectives that follow one another (skipgram = 1
# allows one intervening token)
cooc <- cooccurrence(tidy_lemma$lemma, relevant = tidy_lemma$upos %in% c("NOUN", "ADJ"), skipgram = 1)
head(cooc)
## term1 term2 cooc
## 1 bull fight 13
## 2 first novel 10
## 3 main character 9
## 4 first book 9
## 5 other book 8
## 6 lost generation 7
# Network of the 15 strongest pairs; edge width/alpha encode co-occurrence
wordnetwork <- head(cooc, 15)
wordnetwork <- graph_from_data_frame(wordnetwork)
ggraph(wordnetwork, layout = "fr") +
geom_edge_link(aes(width = cooc,edge_alpha = cooc), edge_colour = "skyblue") +
geom_node_text(aes(label = name),col = "darkblue", size = 4) +
theme_void() + labs(title = "Words following one another",subtitle = "Nouns & Adjective")
# Co-occurrence of noun/adjective lemmas within the same document (review),
# regardless of distance
cooc <- cooccurrence(x = subset(tidy_lemma, upos %in% c("NOUN", "ADJ")), term = "lemma", group = c("doc_id"))
head(cooc)
## term1 term2 cooc
## 1 book bull 285
## 2 book character 240
## 3 book novel 232
## 4 book time 163
## 5 book other 162
## 6 book man 155
# Network of the 30 strongest pairs; edge width/alpha encode co-occurrence
wordnetwork <- head(cooc, 30)
wordnetwork <- graph_from_data_frame(wordnetwork)
ggraph(wordnetwork, layout = "fr") +
geom_edge_link(aes(width = cooc,edge_alpha = cooc), edge_colour = "skyblue") +
geom_node_text(aes(label = name),col = "darkblue", size = 4) +
theme(legend.position = "none") + theme_void() +
labs(title = "Cooccurrences within documents", subtitle = "Nouns & Adjective")
library(textplot)
# Visualize the dependency parse of the first sentence of the first review
textplot_dependencyparser(tidy_lemma %>%filter(doc_id == "1" & sentence_id == "1"))